suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities
Settings
# Machine-specific locations. Every path below is absolute and ends in '/',
# because later code builds file names with paste0(dir, 'file').
# `data_dir` is not referenced again in the part of the file visible here.
data_dir <- '/Volumes/Mitsu_NGS_3/METTL2A/'
wd <- "/Users/s-mitsutomi/My Drive (shuheimitsutomi@ric.u-tokyo.ac.jp)/Analysis/METTL2A/"
# NOTE(review): setwd() changes global session state; the paths built below are
# already absolute, so it may be removable -- confirm nothing relies on it.
setwd(wd)
# Output directories for figures and tables produced by this analysis.
figdir <- paste0(wd, 'Figures/Shortread/Stringtie_tximport_DESeq2/')
tabledir <- paste0(wd, 'Tables/Shortread/')
# Default ggplot2 theme for every plot created after this point:
# classic theme, 7 pt base font, legend placed below the panel.
theme_set(
theme_classic(base_size = 7) +
theme(legend.position = 'bottom')
)
Functions
# Add a coarse RNA class column, `genetype2`, derived from the annotation
# columns `gene_type`, `gene_name` and `seqname`.
#
# Rules are tested top to bottom and the first match wins:
#   protein_coding on chrM            -> 'mt-mRNA'
#   protein_coding elsewhere          -> 'mRNA'
#   name contains 'MT-RNR', on chrM   -> 'mt-rRNA'
#   name contains 'MT-T', on chrM     -> 'mt-tRNA'
#   gene_type is NA                   -> 'unannotated gene'
#   anything else                     -> 'other ncRNAs'
add_genetype2 <- function(df) {
  mutate(
    df,
    genetype2 = case_when(
      seqname == 'chrM' & gene_type == 'protein_coding' ~ 'mt-mRNA',
      seqname != 'chrM' & gene_type == 'protein_coding' ~ 'mRNA',
      seqname == 'chrM' & grepl('MT-RNR', gene_name, fixed = TRUE) ~ 'mt-rRNA',
      seqname == 'chrM' & grepl('MT-T', gene_name, fixed = TRUE) ~ 'mt-tRNA',
      is.na(gene_type) ~ 'unannotated gene',
      .default = 'other ncRNAs'
    )
  )
}
# Classify every gene by its response in the two knockdown contrasts whose
# columns are prefixed `siMETTL2A_G_` and `siMETTL2A_I_`.
#
# df: data frame with columns siMETTL2A_G_pvalue, siMETTL2A_I_pvalue,
#     siMETTL2A_G_log2FoldChange and siMETTL2A_I_log2FoldChange.
#
# Returns an ungrouped tibble with three extra character columns:
#   isUp        'common' (p < .05 and log2FC > 0 in both contrasts),
#               'only G' / 'only I' (criterion met in that contrast, but the
#               gene is not 'common'), otherwise 'not'
#   isDown      the same scheme for log2FC < 0
#   common_DEGs 'up' / 'down' when the gene is 'common' in that direction,
#               otherwise 'other'
#
# pmax()/pmin() compute the per-row extremes in one vectorised pass, so the
# per-row rowwise() evaluation is no longer needed (and empty input no longer
# triggers max()/min() warnings). NA handling is unchanged: an NA in either
# contrast makes the 'common' test NA, case_when() treats that as "no match",
# and the row falls through to the single-contrast rules.
#
# NOTE(review): significance is called on the raw `pvalue` columns, not on
# `padj` -- confirm this is intended.
add_isDEG <- function(df) {
  df |>
    # rowwise() used to discard any existing grouping and return a tibble;
    # keep that contract.
    ungroup() |>
    as_tibble() |>
    mutate(
      isUp = case_when(
        pmax(siMETTL2A_G_pvalue, siMETTL2A_I_pvalue) < .05 &
          pmin(siMETTL2A_G_log2FoldChange, siMETTL2A_I_log2FoldChange) > 0
        ~ 'common',
        siMETTL2A_G_pvalue < .05 & siMETTL2A_G_log2FoldChange > 0
        ~ 'only G',
        siMETTL2A_I_pvalue < .05 & siMETTL2A_I_log2FoldChange > 0
        ~ 'only I',
        .default = 'not'
      ),
      isDown = case_when(
        pmax(siMETTL2A_G_pvalue, siMETTL2A_I_pvalue) < .05 &
          pmax(siMETTL2A_G_log2FoldChange, siMETTL2A_I_log2FoldChange) < 0
        ~ 'common',
        siMETTL2A_G_pvalue < .05 & siMETTL2A_G_log2FoldChange < 0
        ~ 'only G',
        siMETTL2A_I_pvalue < .05 & siMETTL2A_I_log2FoldChange < 0
        ~ 'only I',
        .default = 'not'
      )
    ) |>
    mutate(
      common_DEGs = case_when(
        isUp == 'common' ~ 'up',
        isDown == 'common' ~ 'down',
        .default = 'other'
      )
    )
}
# Drop rows whose `gene_id` contains a literal '|', i.e. entries where several
# gene IDs were concatenated into one field.
exclude_genes_with_samename <- function(df) {
  filter(df, !grepl('|', gene_id, fixed = TRUE))
}
# Flag each gene as methylated ('+') or not ('-').
#
# df:               data frame with a `gene_id` column.
# methylated_genes: lookup table with `gene_id` and `methylation` columns.
#                   Defaults to the script-level `DRS_methylated_genes`; the
#                   default is evaluated lazily, so that object only has to
#                   exist when the function is called, not when it is defined.
#
# Returns `df` with a `methylation` column; genes absent from the lookup table
# get '-'. The join key is stated explicitly so the result cannot change when
# the two tables happen to share additional column names.
add_methylation_info <- function(df, methylated_genes = DRS_methylated_genes) {
  df |>
    left_join(methylated_genes, by = 'gene_id') |>
    replace_na(list(methylation = '-'))
}
Read data
List of methylated genes
# One row per gene that has at least one entry in the methylated-positions
# table, tagged with methylation = '+'.
DRS_methylated_genes <-
  paste0(wd, 'Tables/DRS_m3C_sites/methylated_positions_2024-03-29.tsv') |>
  read_tsv() |>
  distinct(gene_id) |>
  mutate(methylation = '+')
## Rows: 632 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (13): transcript_id, kmer, seqname, source, feature, score, strand, fram...
## dbl (3): position, start, end
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the methylated-gene lookup table.
DRS_methylated_genes
## # A tibble: 80 × 2
## gene_id methylation
## <chr> <chr>
## 1 ENSG00000008988.11 +
## 2 ENSG00000086548.9 +
## 3 ENSG00000089009.16 +
## 4 ENSG00000240972.2 +
## 5 ENSG00000026025.16 +
## 6 ENSG00000111640.15 +
## 7 ENSG00000111775.3 +
## 8 ENSG00000112306.8 +
## 9 ENSG00000034510.6 +
## 10 ENSG00000116251.11 +
## # ℹ 70 more rows
# Gene-level annotation: the distinct combinations of gene ID, gene name,
# gene type and sequence name found in the cleaned annotation table.
espresso_AsPC1_geneinfo <-
  paste0(wd, 'Tables/Espresso_AsPC1_annotation_cleaned_2024-03-29.tsv') |>
  read_tsv() |>
  select(all_of(c('gene_id', 'gene_name', 'gene_type', 'seqname'))) |>
  distinct()
## Rows: 36717 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (12): seqname, source, feature, score, strand, frame, gene_id, transcrip...
## dbl (2): start, end
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the gene-level annotation table.
espresso_AsPC1_geneinfo
## # A tibble: 13,486 × 4
## gene_id gene_name gene_type seqname
## <chr> <chr> <chr> <chr>
## 1 ENSG00000113851.16 CRBN protein_coding chr3
## 2 ENSG00000072756.18 TRNT1 protein_coding chr3
## 3 ENSG00000170364.13 SETMAR protein_coding chr3
## 4 ENSG00000144455.14 SUMF1 protein_coding chr3
## 5 ENSG00000235978.8 ENSG00000235978 lncRNA chr3
## 6 ENSG00000235831.8 BHLHE40-AS1 lncRNA chr3
## 7 ENSG00000134107.5 BHLHE40 protein_coding chr3
## 8 ENSG00000134108.14 ARL8B protein_coding chr3
## 9 ENSG00000134109.11 EDEM1 protein_coding chr3
## 10 ENSG00000189229.12 ENSG00000189229 lncRNA chr3
## # ℹ 13,476 more rows
DESeq2 result
# Per-gene DESeq2 results table; columns are prefixed by contrast
# (siMETTL2A_, siMETTL2A_G_, siMETTL2A_I_).
shortread_stringtie_txi_DESeq2 <-
  paste0(
    wd, 'Tables/Shortread/shortread_stringtie_txi_DESeq2_2024-04-16.tsv.gz'
  ) |>
  read_tsv()
## Rows: 13043 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): gene_name, gene_id
## dbl (18): siMETTL2A_baseMean, siMETTL2A_log2FoldChange, siMETTL2A_lfcSE, siM...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the DESeq2 results table.
shortread_stringtie_txi_DESeq2
## # A tibble: 13,043 × 20
## gene_name siMETTL2A_baseMean siMETTL2A_log2FoldChange siMETTL2A_lfcSE
## <chr> <dbl> <dbl> <dbl>
## 1 7SK 0 NA NA
## 2 A1CF 4720. -0.917 0.0782
## 3 A4GALT 335. 1.89 0.567
## 4 AAAS 3917. -0.298 0.139
## 5 AACS 4911. -0.146 0.337
## 6 AADACP1 61.2 0.932 0.933
## 7 AADAT 15.5 -2.52 1.93
## 8 AAGAB 9386. -0.285 0.145
## 9 AAK1 54.6 0.205 0.632
## 10 AAMDC 1677. 0.763 0.529
## # ℹ 13,033 more rows
## # ℹ 16 more variables: siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>,
## # siMETTL2A_padj <dbl>, siMETTL2A_G_baseMean <dbl>,
## # siMETTL2A_G_log2FoldChange <dbl>, siMETTL2A_G_lfcSE <dbl>,
## # siMETTL2A_G_stat <dbl>, siMETTL2A_G_pvalue <dbl>, siMETTL2A_G_padj <dbl>,
## # siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## # siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, …
Preprocessing
Some genes are —-
# List the entries whose `gene_id` field holds several IDs joined by '|'.
shortread_stringtie_txi_DESeq2 |>
  filter(grepl('|', gene_id, fixed = TRUE)) |>
  select(gene_name, gene_id)
## # A tibble: 69 × 2
## gene_name gene_id
## <chr> <chr>
## 1 7SK ENSG00000275933.1|ENSG00000273591.1|ENSG00000274303.1|ENSG000…
## 2 AADACP1 ENSG00000291076.1|ENSG00000240602.10
## 3 ABCC13 ENSG00000291052.1|ENSG00000243064.11
## 4 ABCC6P1 ENSG00000291057.1|ENSG00000256340.11
## 5 ABCC6P2 ENSG00000255277.4|ENSG00000290943.1
## 6 AMZ2P1 ENSG00000291140.1|ENSG00000214174.11
## 7 ANAPC1P2 ENSG00000285793.1|ENSG00000231259.5
## 8 ARMCX5-GPRASP2 ENSG00000271147.8|ENSG00000286237.1
## 9 CA5BP1 ENSG00000290746.1|ENSG00000186312.11
## 10 CASTOR3P ENSG00000291122.1|ENSG00000239521.12
## # ℹ 59 more rows
Exclude such genes
# Build the per-gene analysis table:
#   1. drop entries whose gene_id holds several '|'-joined IDs,
#   2. flag methylated genes,
#   3. classify up/down DEGs,
#   4. attach gene_type / seqname from the annotation,
#   5. derive the coarse RNA class `genetype2`.
shortread_stringtie_txi_DESeq2_DEG_methylation <-
  shortread_stringtie_txi_DESeq2 |>
  exclude_genes_with_samename() |>
  add_methylation_info() |>
  add_isDEG() |>
  # Join keys are spelled out instead of inferred from shared column names.
  # 'many-to-one' makes the join fail loudly if a gene ever matches more than
  # one annotation row; a silent fan-out would duplicate genes and inflate
  # every count and percentage computed from this table.
  left_join(
    espresso_AsPC1_geneinfo,
    by = join_by(gene_name, gene_id),
    relationship = 'many-to-one'
  ) |>
  add_genetype2()
## Joining with `by = join_by(gene_id)`
## Joining with `by = join_by(gene_name, gene_id)`
# Write the annotated table into `tabledir` as a gzip-compressed TSV using the
# myUtilities helper.
# NOTE(review): the exported file name appears to be derived from the name of
# the piped object plus the current date -- confirm before renaming the object
# or restructuring this call.
shortread_stringtie_txi_DESeq2_DEG_methylation |>
export_tsv(outdir = tabledir, compression = 'gz')
##
## Exported to: /Users/s-mitsutomi/My Drive (shuheimitsutomi@ric.u-tokyo.ac.jp)/Analysis/METTL2A/Tables/Shortread/shortread_stringtie_txi_DESeq2_DEG_methylation_2024-04-17.tsv.gz
## # A tibble: 12,974 × 27
## gene_name siMETTL2A_baseMean siMETTL2A_log2FoldChange siMETTL2A_lfcSE
## <chr> <dbl> <dbl> <dbl>
## 1 A1CF 4720. -0.917 0.0782
## 2 A4GALT 335. 1.89 0.567
## 3 AAAS 3917. -0.298 0.139
## 4 AACS 4911. -0.146 0.337
## 5 AADAT 15.5 -2.52 1.93
## 6 AAGAB 9386. -0.285 0.145
## 7 AAK1 54.6 0.205 0.632
## 8 AAMDC 1677. 0.763 0.529
## 9 AAMP 20411. 0.0309 0.200
## 10 AAR2 4967. -0.106 0.401
## # ℹ 12,964 more rows
## # ℹ 23 more variables: siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>,
## # siMETTL2A_padj <dbl>, siMETTL2A_G_baseMean <dbl>,
## # siMETTL2A_G_log2FoldChange <dbl>, siMETTL2A_G_lfcSE <dbl>,
## # siMETTL2A_G_stat <dbl>, siMETTL2A_G_pvalue <dbl>, siMETTL2A_G_padj <dbl>,
## # siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## # siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, …
Calculate percentage of DEGs
# Number of genes in each methylation class ('+' / '-').
shortread_stringtie_txi_DESeq2_DEG_methylation |>
  count(methylation)
## # A tibble: 2 × 2
## methylation n
## <chr> <int>
## 1 + 79
## 2 - 12895
# Gene counts per DEG class and RNA class, plus the share (in %) that each DEG
# class takes within its RNA class.
shortread_stringtie_txi_DESeq2_DEG_genetype_percentage <-
  shortread_stringtie_txi_DESeq2_DEG_methylation |>
  count(common_DEGs, genetype2) |>
  mutate(percentage = 100 * n / sum(n), .by = genetype2)
shortread_stringtie_txi_DESeq2_DEG_genetype_percentage
## # A tibble: 11 × 4
## common_DEGs genetype2 n percentage
## <chr> <chr> <int> <dbl>
## 1 down mRNA 1687 16.5
## 2 down other ncRNAs 118 4.29
## 3 other mRNA 7074 69.3
## 4 other mt-mRNA 3 23.1
## 5 other mt-tRNA 3 42.9
## 6 other other ncRNAs 2404 87.4
## 7 up mRNA 1440 14.1
## 8 up mt-mRNA 10 76.9
## 9 up mt-rRNA 2 100
## 10 up mt-tRNA 4 57.1
## 11 up other ncRNAs 229 8.32
# Gene counts per DEG class and methylation class, plus the share (in %) that
# each DEG class takes within its methylation class; rows sorted by
# methylation class.
shortread_stringtie_txi_DESeq2_DEG_methylation_percentage <-
  shortread_stringtie_txi_DESeq2_DEG_methylation |>
  count(common_DEGs, methylation) |>
  mutate(percentage = 100 * n / sum(n), .by = methylation) |>
  arrange(methylation)
shortread_stringtie_txi_DESeq2_DEG_methylation_percentage
## # A tibble: 6 × 4
## common_DEGs methylation n percentage
## <chr> <chr> <int> <dbl>
## 1 down + 3 3.80
## 2 other + 53 67.1
## 3 up + 23 29.1
## 4 down - 1802 14.0
## 5 other - 9431 73.1
## 6 up - 1662 12.9
# Gene counts per DEG class, methylation class and RNA class, plus the share
# (in %) that each DEG class takes within its methylation x RNA class
# combination; rows sorted by methylation class.
shortread_stringtie_txi_DESeq2_DEG_methylation_genetype_percentage <-
  shortread_stringtie_txi_DESeq2_DEG_methylation |>
  count(common_DEGs, methylation, genetype2) |>
  mutate(percentage = 100 * n / sum(n), .by = c(methylation, genetype2)) |>
  arrange(methylation)
shortread_stringtie_txi_DESeq2_DEG_methylation_genetype_percentage
## # A tibble: 16 × 5
## common_DEGs methylation genetype2 n percentage
## <chr> <chr> <chr> <int> <dbl>
## 1 down + mRNA 3 4.41
## 2 other + mRNA 52 76.5
## 3 other + mt-mRNA 1 11.1
## 4 up + mRNA 13 19.1
## 5 up + mt-mRNA 8 88.9
## 6 up + mt-rRNA 2 100
## 7 down - mRNA 1684 16.6
## 8 down - other ncRNAs 118 4.29
## 9 other - mRNA 7022 69.3
## 10 other - mt-mRNA 2 50
## 11 other - mt-tRNA 3 42.9
## 12 other - other ncRNAs 2404 87.4
## 13 up - mRNA 1427 14.1
## 14 up - mt-mRNA 2 50
## 15 up - mt-tRNA 4 57.1
## 16 up - other ncRNAs 229 8.32
Plot
# Horizontal stacked bars: one bar per methylation class, segments showing the
# percentage of genes in each DEG class.
shortread_stringtie_txi_DESeq2_DEG_methylation_percentage_barplot <-
  shortread_stringtie_txi_DESeq2_DEG_methylation_percentage |>
  ggplot(aes(x = methylation, y = percentage, fill = common_DEGs)) +
  geom_col() +
  scale_y_reverse() +
  coord_flip() +
  # Colours are bound to categories by name. An unnamed vector is matched by
  # position, so a missing category would silently shift the palette
  # (e.g. 'other' drawn in blue).
  scale_fill_manual(
    values = c(down = '#3e3ef2', other = 'grey', up = '#f23e3e')
  )
# Save via the myUtilities helper; the call is kept in piped form because the
# helper may take the output name from the piped object -- TODO confirm.
shortread_stringtie_txi_DESeq2_DEG_methylation_percentage_barplot |>
  ggsave_multiple_formats(
    width = 3.5, height = 2.5, fontsize = 7, outdir = figdir
  )

# Horizontal bars, one per methylation x RNA class combination. Counts are
# passed as `y = n` and position_fill() rescales every bar to the same total
# height, so segments show each DEG class's proportion within the combination.
shortread_stringtie_txi_DESeq2_DEG_methylation_genetype_percentage_barplot <-
  shortread_stringtie_txi_DESeq2_DEG_methylation_genetype_percentage |>
  ggplot(aes(
    x = interaction(methylation |> fct_rev(), genetype2 |> fct_rev()),
    y = n, fill = common_DEGs
  )) +
  geom_col(position = position_fill()) +
  scale_y_reverse() +
  # interaction() joins the two factors with '.'; the nested-axis guide is
  # given the same delimiter to split the labels into two levels.
  scale_x_discrete(guide = ggh4x::guide_axis_nested(delim = '.')) +
  # Colours are bound to categories by name. An unnamed vector is matched by
  # position, so a missing category would silently shift the palette.
  scale_fill_manual(
    values = c(down = '#3e3ef2', other = 'grey', up = '#f23e3e')
  ) +
  coord_flip()
# Save via the myUtilities helper; the call is kept in piped form because the
# helper may take the output name from the piped object -- TODO confirm.
shortread_stringtie_txi_DESeq2_DEG_methylation_genetype_percentage_barplot |>
  ggsave_multiple_formats(
    width = 6, height = 6, fontsize = 7, outdir = figdir
  )
## Warning: The S3 guide system was deprecated in ggplot2 3.5.0.
## ℹ It has been replaced by a ggproto system that can be extended.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Horizontal stacked bars: one bar per RNA class (order reversed with
# fct_rev()), segments showing the percentage of genes in each DEG class.
shortread_stringtie_txi_DESeq2_DEG_genetype_percentage_barplot <-
  shortread_stringtie_txi_DESeq2_DEG_genetype_percentage |>
  ggplot(aes(
    x = genetype2 |> fct_rev(),
    y = percentage, fill = common_DEGs
  )) +
  geom_col() +
  scale_y_reverse() +
  coord_flip() +
  # Colours are bound to categories by name. An unnamed vector is matched by
  # position, so a missing category would silently shift the palette.
  scale_fill_manual(
    values = c(down = '#3e3ef2', other = 'grey', up = '#f23e3e')
  )
# Save via the myUtilities helper; the call is kept in piped form because the
# helper may take the output name from the piped object -- TODO confirm.
shortread_stringtie_txi_DESeq2_DEG_genetype_percentage_barplot |>
  ggsave_multiple_formats(
    width = 5, height = 5, fontsize = 7, outdir = figdir
  )
